1 Summary

  • how to process NAs
  • empty attributes (zeros)
  • Pearson correlation

2 Analysis

2.1 Used R libraries

  • psych - for “describe” function
  • ggplot2 - for visualisations
  • dplyr - for human-friendly data processing
  • friendlyeval - for dynamic column selection during dplyr processing
  • reshape2 - for the very useful melt function (R implementation is very slow!)
  • shiny - for dynamic correlation chart
  • DT - for dynamic tables
  • archive - for 7z extraction
  • grid,gridExtra - for plot grid

2.2 Initialization code

set.seed(23) # fixed seed for functions that use random numbers

# Render a data frame as an interactive DT table so that every table in the
# report shares the same look.
#
# Args:
#   table_df:      data frame to display.
#   round_columns: columns to round; forwarded to DT::formatRound()
#                  (default: no columns).
#   round_digits:  number of decimal places applied to `round_columns`.
#
# Returns:
#   The datatable widget created by DT::datatable() with the rounding format
#   applied.
prettyTable <- function(table_df, round_columns = numeric(), round_digits = 2) {
  DT::datatable(
    table_df,
    style = "bootstrap",
    filter = "top",
    rownames = FALSE,
    extensions = "Buttons",
    options = list(
      dom = "Bfrtip",
      buttons = c("copy", "csv", "excel", "pdf", "print")
    )
  ) %>%
    # Namespace-qualified like datatable() above, so the helper also works
    # when DT is installed but not attached with library(DT).
    DT::formatRound(round_columns, round_digits)
}

2.3 Data loading

2.3.1 Downloading dataset

# Download and unpack the dataset unless the CSV is already on disk.
# `working_dir` and `params$source_url` are defined outside this chunk.
tempFilePath <- paste0(working_dir, "/../data/temp.7z")
dataFilePath <- paste0(working_dir, "/../data/all_summary.csv")
if (!file.exists(dataFilePath)) {
  message("Downloading data from: ", params$source_url)
  # download.file() does not create missing directories.
  dir.create(dirname(dataFilePath), recursive = TRUE, showWarnings = FALSE)
  # A 7z archive is binary: without mode = "wb" the download is written in
  # text mode on Windows, which corrupts the archive.
  download.file(params$source_url, tempFilePath, mode = "wb")
  # Extract into the directory where dataFilePath is checked below, instead
  # of a path relative to the current working directory.
  archive_extract(tempFilePath, dirname(dataFilePath))
  if (!file.exists(dataFilePath)) {
    stop("Data file not found")
  }
} else {
  # The existing file is reused as-is; it is not checked against the source.
  message("Data was downloaded previously. Be careful!")
}
## Data was downloaded previously. Be careful!

2.3.2 Reading sample data

# Read only the first `sampleRowsNo` records of the semicolon-separated data
# file; the first line of the file supplies the column names.
# NOTE(review): the variable `sample` has the same name as base::sample().
sampleRowsNo <- 1000
sample <- read.table(
  file = dataFilePath,
  sep = ";",
  header = TRUE,
  nrows = sampleRowsNo
)

2.3.3 Dataset attributes lists

# presumably the number of most frequent classes analysed later; it is not
# used in this chunk -- confirm against its uses
noTopClasses <- 50

# Target class for classification.
attrib.target_class <- "res_name"

# All attributes (column names).
# NOTE(review): `data` is not created in the chunks shown here; confirm it is
# the loaded data frame at this point.
attrib.all <- colnames(data)

# Attribute groups, selected by a literal substring of the column name.
attrib.local <- attrib.all[grep("local_", attrib.all, fixed = TRUE)]
attrib.dict <- attrib.all[grep("dict_atom_", attrib.all, fixed = TRUE)]
attrib.part <- attrib.all[grep("part_", attrib.all, fixed = TRUE)]
attrib.skeleton <- attrib.all[grep("skeleton_", attrib.all, fixed = TRUE)]

# Resolution attribute.
attrib.res <- "resolution"

# Parameter attributes.
attrib.params <- c(
  "fo_col",
  "fc_col",
  "weight_col",
  "grid_space",
  "solvent_radius",
  "solvent_opening_radius"
)

# Unknown columns.
attrib.unknown <- c(
  "blob_coverage",
  "blob_volume_coverage_second",
  "resolution_max_limit",
  "FoFc_square_std",
  "res_coverage",
  "res_volume_coverage",
  "FoFc_mean",
  "FoFc_min",
  "blob_volume_coverage",
  "res_volume_coverage_second",
  "FoFc_std",
  "FoFc_max",
  "resolution"
)

# Attributes that are illegal for classification. The "local" and "dict"
# groups are not listed here; they are excluded below when attrib.legal is
# built.
attrib.illegal <- c(
  "title",
  "pdb_code",
  "res_name",
  "res_id",
  "chain_id",
  "local_",
  "weight_col", # is NA!
  "skeleton_data",
  attrib.unknown,
  attrib.params
)

# Legal attributes: everything that is not illegal, "local" or "dict".
attrib.legal <- setdiff(
  attrib.all,
  c(attrib.illegal, attrib.local, attrib.dict)
)

# Legal attributes whose name contains "_shape_" / "_density_".
attrib.part.shape <- attrib.legal[grep("_shape_", attrib.legal, fixed = TRUE)]
attrib.part.density <- attrib.legal[
  grep("_density_", attrib.legal, fixed = TRUE)
]

2.4 Data cleansing

2.4.1 Remove unnecessary ligands (project requirement)

# Residue names excluded from the analysis (project requirement).
# NOTE(review): "H20" is spelled with a zero; confirm it is not meant to be
# "H2O" before changing it.
excluded_names <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
  "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP",
  "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU",
  "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)

# Keep only rows whose res_name is present and not on the exclusion list.
data.raw <- filter(
  data,
  !is.na(res_name),
  !(res_name %in% excluded_names)
)

2.4.2 Filling data gaps

# Fill missing values in data.raw with per-class means, then zero-fill
# whatever is still missing.

# Step 1: per-class (res_name) mean of every legal attribute, computed only
# from rows that have no NA in any of the selected columns.
# Plain `<-` is used: this is top-level code, so `<<-` is not needed.
# Base subsetting replaces select() with external character vectors.
class_mean_cols <- setdiff(
  c(attrib.legal, attrib.target_class),
  "skeleton_data"
)
replacements <- data.raw[, class_mean_cols, drop = FALSE] %>%
  na.omit() %>%
  group_by(res_name) %>%
  # Passing the function directly replaces the deprecated funs() wrapper.
  summarize_all(mean)

# Class label of every row; `[[` returns a plain vector for both data frames
# and tibbles.
ids <- data.raw[[attrib.target_class]]

# Step 2: for each attribute, replace NA cells with the mean of the row's
# class. match() finds each row's class in `replacements`, which holds one
# row per class. Classes that are absent from `replacements` yield NA and
# are handled by step 3.
for (col in setdiff(colnames(replacements), attrib.target_class)) {
  missing_rows <- is.na(data.raw[[col]])
  if (!any(missing_rows)) {
    next
  }
  class_row <- match(ids[missing_rows], replacements[[attrib.target_class]])
  data.raw[missing_rows, col] <- replacements[[col]][class_row]
}

# Step 3: any value that is still NA becomes 0.
data.withoutGaps <- replace(data.raw, is.na(data.raw), 0)

2.5 Dataset description

2.5.1 Simple statistics

## Number of rows:  996
## Number of attributes:  412
## Number of legal attributes:  412

2.5.2 Attributes

##                                             vars   n          mean
## blob_coverage*                                 1 996  4.985700e+02
## res_coverage*                                  2 996  4.992900e+02
## title*                                         3 996  4.224800e+02
## pdb_code*                                      4 996  7.311000e+01
## res_name*                                      5 996  8.688000e+01
## res_id                                         6 996  7.320700e+02
## chain_id*                                      7 996  2.270000e+00
## blob_volume_coverage                           8 996  6.400000e-01
## blob_volume_coverage_second                    9 996  2.000000e-02
## res_volume_coverage                           10 996  5.100000e-01
## res_volume_coverage_second                    11 996  8.000000e-02
## local_res_atom_count                          12 996  1.489000e+01
## local_res_atom_non_h_count                    13 996  1.468000e+01
## local_res_atom_non_h_occupancy_sum            14 996  1.387000e+01
## local_res_atom_non_h_electron_sum             15 996  1.096800e+02
## local_res_atom_non_h_electron_occupancy_sum   16 996  1.025500e+02
## local_res_atom_C_count                        17 996  8.670000e+00
## local_res_atom_N_count                        18 996  1.440000e+00
## local_res_atom_O_count                        19 996  3.680000e+00
## local_res_atom_S_count                        20 996  2.200000e-01
## dict_atom_non_h_count                         21 996  1.465000e+01
## dict_atom_non_h_electron_sum                  22 996  1.095400e+02
## dict_atom_C_count                             23 996  8.600000e+00
## dict_atom_N_count                             24 996  1.390000e+00
## dict_atom_O_count                             25 996  3.810000e+00
## dict_atom_S_count                             26 996  2.200000e-01
## skeleton_data*                                27 996  5.008200e+02
## skeleton_cycle_4                              28 996  2.090000e+00
## skeleton_diameter                             29 996  2.433000e+01
## skeleton_cycle_6                              30 996  8.000000e-02
## skeleton_cycle_7                              31 996  5.000000e-02
## skeleton_closeness_006_008                    32 996  2.140000e+00
## skeleton_closeness_002_004                    33 996  2.000000e-02
## skeleton_cycle_3                              34 996  6.700000e-01
## skeleton_avg_degree                           35 996  1.570000e+00
## skeleton_closeness_004_006                    36 996  7.600000e-01
## skeleton_closeness_010_012                    37 996  3.220000e+00
## skeleton_closeness_012_014                    38 996  3.390000e+00
## skeleton_edges                                39 996  4.115000e+01
## skeleton_radius                               40 996  1.247000e+01
## skeleton_cycle_8_plus                         41 996  1.450000e+00
## skeleton_closeness_020_030                    42 996  5.770000e+00
## skeleton_deg_5_plus                           43 996  2.090000e+00
## skeleton_closeness_016_018                    44 996  2.500000e+00
## skeleton_closeness_008_010                    45 996  3.060000e+00
## skeleton_closeness_018_020                    46 996  2.130000e+00
## skeleton_average_clustering                   47 996  0.000000e+00
## skeleton_closeness_040_050                    48 996  3.340000e+00
## skeleton_closeness_014_016                    49 996  3.090000e+00
## skeleton_center                               50 996  1.800000e+00
## skeleton_closeness_000_002                    51 996  1.100000e-01
## skeleton_density                              52 996  2.300000e-01
## skeleton_closeness_030_040                    53 996  3.140000e+00
## skeleton_deg_4                                54 996  2.100000e-01
## skeleton_deg_0                                55 996  1.100000e-01
## skeleton_deg_1                                56 996  3.140000e+00
## skeleton_deg_2                                57 996  3.029000e+01
## skeleton_deg_3                                58 996  1.700000e+00
## skeleton_graph_clique_number                  59 996  1.900000e+00
## skeleton_nodes                                60 996  3.754000e+01
## skeleton_cycles                               61 996  4.610000e+00
## skeleton_cycle_5                              62 996  2.500000e-01
## skeleton_closeness_050_plus                   63 996  4.870000e+00
## skeleton_periphery                            64 996  1.970000e+00
## local_volume                                  65 996  8.316600e+02
## local_electrons                               66 996  1.890000e+01
## local_mean                                    67 996  2.000000e-02
## local_std                                     68 996  1.200000e-01
## local_min                                     69 996  0.000000e+00
## local_max                                     70 996  1.350000e+00
## local_max_over_std                            71 996  1.014000e+01
## local_skewness                                72 996  2.200000e-01
## local_cut_by_mainchain_volume                 73 996  4.900000e-01
## local_near_cut_count_C                        74 996  4.860000e+00
## local_near_cut_count_other                    75 996  3.000000e-02
## local_near_cut_count_S                        76 996  1.700000e-01
##                                                       sd        median
## blob_coverage*                              2.882600e+02  4.975000e+02
## res_coverage*                               2.886100e+02  4.985000e+02
## title*                                      2.434500e+02  4.335000e+02
## pdb_code*                                   4.264000e+01  8.100000e+01
## res_name*                                   3.889000e+01  9.100000e+01
## res_id                                      8.617600e+02  5.010000e+02
## chain_id*                                   2.160000e+00  1.500000e+00
## blob_volume_coverage                        2.500000e-01  7.000000e-01
## blob_volume_coverage_second                 5.000000e-02  0.000000e+00
## res_volume_coverage                         3.000000e-01  4.700000e-01
## res_volume_coverage_second                  2.200000e-01  0.000000e+00
## local_res_atom_count                        1.502000e+01  8.000000e+00
## local_res_atom_non_h_count                  1.480000e+01  8.000000e+00
## local_res_atom_non_h_occupancy_sum          1.444000e+01  7.000000e+00
## local_res_atom_non_h_electron_sum           9.798000e+01  6.400000e+01
## local_res_atom_non_h_electron_occupancy_sum 9.649000e+01  4.800000e+01
## local_res_atom_C_count                      1.097000e+01  4.000000e+00
## local_res_atom_N_count                      2.160000e+00  0.000000e+00
## local_res_atom_O_count                      3.570000e+00  4.000000e+00
## local_res_atom_S_count                      5.200000e-01  0.000000e+00
## dict_atom_non_h_count                       1.490000e+01  7.000000e+00
## dict_atom_non_h_electron_sum                9.855000e+01  5.600000e+01
## dict_atom_C_count                           1.109000e+01  3.000000e+00
## dict_atom_N_count                           2.160000e+00  0.000000e+00
## dict_atom_O_count                           3.570000e+00  4.000000e+00
## dict_atom_S_count                           5.100000e-01  0.000000e+00
## skeleton_data*                              2.889300e+02  5.005000e+02
## skeleton_cycle_4                            3.095000e+01  0.000000e+00
## skeleton_diameter                           2.990000e+01  1.300000e+01
## skeleton_cycle_6                            1.350000e+00  0.000000e+00
## skeleton_cycle_7                            8.300000e-01  0.000000e+00
## skeleton_closeness_006_008                  1.262000e+01  0.000000e+00
## skeleton_closeness_002_004                  5.100000e-01  0.000000e+00
## skeleton_cycle_3                            1.063000e+01  0.000000e+00
## skeleton_avg_degree                         6.900000e-01  1.860000e+00
## skeleton_closeness_004_006                  8.880000e+00  0.000000e+00
## skeleton_closeness_010_012                  9.970000e+00  0.000000e+00
## skeleton_closeness_012_014                  9.740000e+00  0.000000e+00
## skeleton_edges                              1.150300e+02  1.300000e+01
## skeleton_radius                             1.501000e+01  7.000000e+00
## skeleton_cycle_8_plus                       2.200000e+01  0.000000e+00
## skeleton_closeness_020_030                  1.219000e+01  0.000000e+00
## skeleton_deg_5_plus                         3.323000e+01  0.000000e+00
## skeleton_closeness_016_018                  6.370000e+00  0.000000e+00
## skeleton_closeness_008_010                  1.259000e+01  0.000000e+00
## skeleton_closeness_018_020                  5.560000e+00  0.000000e+00
## skeleton_average_clustering                 1.000000e-02  0.000000e+00
## skeleton_closeness_040_050                  2.793000e+01  0.000000e+00
## skeleton_closeness_014_016                  8.740000e+00  0.000000e+00
## skeleton_center                             3.980000e+00  1.000000e+00
## skeleton_closeness_000_002                  3.200000e-01  0.000000e+00
## skeleton_density                            3.100000e-01  9.000000e-02
## skeleton_closeness_030_040                  1.487000e+01  0.000000e+00
## skeleton_deg_4                              2.040000e+00  0.000000e+00
## skeleton_deg_0                              3.200000e-01  0.000000e+00
## skeleton_deg_1                              3.090000e+00  2.000000e+00
## skeleton_deg_2                              4.517000e+01  1.200000e+01
## skeleton_deg_3                              3.650000e+00  0.000000e+00
## skeleton_graph_clique_number                3.500000e-01  2.000000e+00
## skeleton_nodes                              6.088000e+01  1.400000e+01
## skeleton_cycles                             6.936000e+01  0.000000e+00
## skeleton_cycle_5                            3.850000e+00  0.000000e+00
## skeleton_closeness_050_plus                 5.540000e+00  2.000000e+00
## skeleton_periphery                          5.800000e-01  2.000000e+00
## local_volume                                1.211220e+03  3.520800e+02
## local_electrons                             2.521000e+01  8.770000e+00
## local_mean                                  2.000000e-02  2.000000e-02
## local_std                                   1.000000e-01  1.000000e-01
## local_min                                   0.000000e+00  0.000000e+00
## local_max                                   1.900000e+00  8.400000e-01
## local_max_over_std                          9.030000e+00  7.310000e+00
## local_skewness                              2.000000e-01  1.700000e-01
## local_cut_by_mainchain_volume               1.160000e+00  0.000000e+00
## local_near_cut_count_C                      5.940000e+00  3.000000e+00
## local_near_cut_count_other                  2.200000e-01  0.000000e+00
## local_near_cut_count_S                      6.600000e-01  0.000000e+00
##                                                   trimmed          mad
## blob_coverage*                               4.983300e+02 3.699100e+02
## res_coverage*                                4.991100e+02 3.706500e+02
## title*                                       4.225200e+02 3.076400e+02
## pdb_code*                                    7.410000e+01 5.486000e+01
## res_name*                                    8.846000e+01 4.300000e+01
## res_id                                       6.011800e+02 4.092000e+02
## chain_id*                                    1.760000e+00 7.400000e-01
## blob_volume_coverage                         6.500000e-01 2.500000e-01
## blob_volume_coverage_second                  0.000000e+00 0.000000e+00
## res_volume_coverage                          4.900000e-01 3.400000e-01
## res_volume_coverage_second                   1.000000e-02 0.000000e+00
## local_res_atom_count                         1.288000e+01 1.038000e+01
## local_res_atom_non_h_count                   1.263000e+01 1.038000e+01
## local_res_atom_non_h_occupancy_sum           1.166000e+01 8.900000e+00
## local_res_atom_non_h_electron_sum            9.654000e+01 5.041000e+01
## local_res_atom_non_h_electron_occupancy_sum  8.832000e+01 4.300000e+01
## local_res_atom_C_count                       6.590000e+00 5.930000e+00
## local_res_atom_N_count                       1.030000e+00 0.000000e+00
## local_res_atom_O_count                       3.080000e+00 2.970000e+00
## local_res_atom_S_count                       1.200000e-01 0.000000e+00
## dict_atom_non_h_count                        1.263000e+01 8.900000e+00
## dict_atom_non_h_electron_sum                 9.683000e+01 5.337000e+01
## dict_atom_C_count                            6.490000e+00 4.450000e+00
## dict_atom_N_count                            9.600000e-01 0.000000e+00
## dict_atom_O_count                            3.250000e+00 2.970000e+00
## dict_atom_S_count                            1.100000e-01 0.000000e+00
## skeleton_data*                               5.008100e+02 3.713900e+02
## skeleton_cycle_4                             0.000000e+00 0.000000e+00
## skeleton_diameter                            1.862000e+01 1.779000e+01
## skeleton_cycle_6                             0.000000e+00 0.000000e+00
## skeleton_cycle_7                             0.000000e+00 0.000000e+00
## skeleton_closeness_006_008                   0.000000e+00 0.000000e+00
## skeleton_closeness_002_004                   0.000000e+00 0.000000e+00
## skeleton_cycle_3                             0.000000e+00 0.000000e+00
## skeleton_avg_degree                          1.680000e+00 2.000000e-01
## skeleton_closeness_004_006                   0.000000e+00 0.000000e+00
## skeleton_closeness_010_012                   3.000000e-01 0.000000e+00
## skeleton_closeness_012_014                   6.500000e-01 0.000000e+00
## skeleton_edges                               2.369000e+01 1.779000e+01
## skeleton_radius                              9.650000e+00 8.900000e+00
## skeleton_cycle_8_plus                        0.000000e+00 0.000000e+00
## skeleton_closeness_020_030                   2.560000e+00 0.000000e+00
## skeleton_deg_5_plus                          0.000000e+00 0.000000e+00
## skeleton_closeness_016_018                   7.400000e-01 0.000000e+00
## skeleton_closeness_008_010                   0.000000e+00 0.000000e+00
## skeleton_closeness_018_020                   6.500000e-01 0.000000e+00
## skeleton_average_clustering                  0.000000e+00 0.000000e+00
## skeleton_closeness_040_050                   7.800000e-01 0.000000e+00
## skeleton_closeness_014_016                   7.400000e-01 0.000000e+00
## skeleton_center                              1.480000e+00 0.000000e+00
## skeleton_closeness_000_002                   2.000000e-02 0.000000e+00
## skeleton_density                             1.600000e-01 1.100000e-01
## skeleton_closeness_030_040                   1.180000e+00 0.000000e+00
## skeleton_deg_4                               0.000000e+00 0.000000e+00
## skeleton_deg_0                               2.000000e-02 0.000000e+00
## skeleton_deg_1                               2.630000e+00 0.000000e+00
## skeleton_deg_2                               2.023000e+01 1.779000e+01
## skeleton_deg_3                               8.200000e-01 0.000000e+00
## skeleton_graph_clique_number                 1.980000e+00 0.000000e+00
## skeleton_nodes                               2.454000e+01 1.779000e+01
## skeleton_cycles                              3.000000e-02 0.000000e+00
## skeleton_cycle_5                             0.000000e+00 0.000000e+00
## skeleton_closeness_050_plus                  4.200000e+00 2.970000e+00
## skeleton_periphery                           1.980000e+00 0.000000e+00
## local_volume                                 5.478400e+02 2.686800e+02
## local_electrons                              1.335000e+01 9.730000e+00
## local_mean                                   2.000000e-02 1.000000e-02
## local_std                                    1.100000e-01 5.000000e-02
## local_min                                    0.000000e+00 0.000000e+00
## local_max                                    1.020000e+00 5.200000e-01
## local_max_over_std                           8.410000e+00 3.950000e+00
## local_skewness                               1.900000e-01 8.000000e-02
## local_cut_by_mainchain_volume                1.700000e-01 0.000000e+00
## local_near_cut_count_C                       3.720000e+00 4.450000e+00
## local_near_cut_count_other                   0.000000e+00 0.000000e+00
## local_near_cut_count_S                       0.000000e+00 0.000000e+00
##                                                      min           max
## blob_coverage*                                      1.00  9.990000e+02
## res_coverage*                                       1.00  1.000000e+03
## title*                                              1.00  8.490000e+02
## pdb_code*                                           1.00  1.360000e+02
## res_name*                                           1.00  1.510000e+02
## res_id                                              1.00  9.002000e+03
## chain_id*                                           1.00  1.600000e+01
## blob_volume_coverage                                0.03  1.000000e+00
## blob_volume_coverage_second                         0.00  3.700000e-01
## res_volume_coverage                                 0.01  1.000000e+00
## res_volume_coverage_second                          0.00  1.000000e+00
## local_res_atom_count                                1.00  5.300000e+01
## local_res_atom_non_h_count                          1.00  5.300000e+01
## local_res_atom_non_h_occupancy_sum                  0.10  5.300000e+01
## local_res_atom_non_h_electron_sum                  12.00  3.840000e+02
## local_res_atom_non_h_electron_occupancy_sum         4.80  3.840000e+02
## local_res_atom_C_count                              0.00  3.800000e+01
## local_res_atom_N_count                              0.00  1.000000e+01
## local_res_atom_O_count                              0.00  1.700000e+01
## local_res_atom_S_count                              0.00  4.000000e+00
## dict_atom_non_h_count                               0.00  5.500000e+01
## dict_atom_non_h_electron_sum                        0.00  3.840000e+02
## dict_atom_C_count                                   0.00  5.000000e+01
## dict_atom_N_count                                   0.00  1.000000e+01
## dict_atom_O_count                                   0.00  1.700000e+01
## dict_atom_S_count                                   0.00  4.000000e+00
## skeleton_data*                                      1.00  1.000000e+03
## skeleton_cycle_4                                    0.00  6.300000e+02
## skeleton_diameter                                   0.00  1.760000e+02
## skeleton_cycle_6                                    0.00  2.700000e+01
## skeleton_cycle_7                                    0.00  1.700000e+01
## skeleton_closeness_006_008                          0.00  1.600000e+02
## skeleton_closeness_002_004                          0.00  1.600000e+01
## skeleton_cycle_3                                    0.00  1.910000e+02
## skeleton_avg_degree                                 0.00  5.980000e+00
## skeleton_closeness_004_006                          0.00  2.070000e+02
## skeleton_closeness_010_012                          0.00  6.900000e+01
## skeleton_closeness_012_014                          0.00  9.500000e+01
## skeleton_edges                                      0.00  1.996000e+03
## skeleton_radius                                     0.00  8.800000e+01
## skeleton_cycle_8_plus                               0.00  4.240000e+02
## skeleton_closeness_020_030                          0.00  8.600000e+01
## skeleton_deg_5_plus                                 0.00  6.330000e+02
## skeleton_closeness_016_018                          0.00  5.600000e+01
## skeleton_closeness_008_010                          0.00  1.090000e+02
## skeleton_closeness_018_020                          0.00  6.000000e+01
## skeleton_average_clustering                         0.00  1.200000e-01
## skeleton_closeness_040_050                          0.00  5.200000e+02
## skeleton_closeness_014_016                          0.00  9.200000e+01
## skeleton_center                                     1.00  8.400000e+01
## skeleton_closeness_000_002                          0.00  1.000000e+00
## skeleton_density                                    0.00  1.000000e+00
## skeleton_closeness_030_040                          0.00  4.300000e+02
## skeleton_deg_4                                      0.00  3.600000e+01
## skeleton_deg_0                                      0.00  1.000000e+00
## skeleton_deg_1                                      0.00  3.100000e+01
## skeleton_deg_2                                      0.00  3.200000e+02
## skeleton_deg_3                                      0.00  3.500000e+01
## skeleton_graph_clique_number                        1.00  4.000000e+00
## skeleton_nodes                                      1.00  6.680000e+02
## skeleton_cycles                                     0.00  1.329000e+03
## skeleton_cycle_5                                    0.00  6.700000e+01
## skeleton_closeness_050_plus                         0.00  2.100000e+01
## skeleton_periphery                                  1.00  1.200000e+01
## local_volume                                       97.34  9.673280e+03
## local_electrons                                     0.47  1.713300e+02
## local_mean                                          0.00  2.400000e-01
## local_std                                           0.02  9.500000e-01
## local_min                                           0.00  0.000000e+00
## local_max                                           0.19  2.716000e+01
## local_max_over_std                                  3.13  1.017200e+02
## local_skewness                                      0.04  2.360000e+00
## local_cut_by_mainchain_volume                       0.00  1.013000e+01
## local_near_cut_count_C                              0.00  4.100000e+01
## local_near_cut_count_other                          0.00  3.000000e+00
## local_near_cut_count_S                              0.00  6.000000e+00
##                                                    range  skew kurtosis
## blob_coverage*                              9.980000e+02  0.01    -1.20
## res_coverage*                               9.990000e+02  0.00    -1.20
## title*                                      8.480000e+02 -0.02    -1.18
## pdb_code*                                   1.350000e+02 -0.22    -1.34
## res_name*                                   1.500000e+02 -0.21    -0.89
## res_id                                      9.001000e+03  4.91    34.69
## chain_id*                                   1.500000e+01  2.91    10.13
## blob_volume_coverage                        9.700000e-01 -0.56    -0.83
## blob_volume_coverage_second                 3.700000e-01  3.59    13.82
## res_volume_coverage                         9.900000e-01  0.34    -1.07
## res_volume_coverage_second                  1.000000e+00  2.97     7.82
## local_res_atom_count                        5.200000e+01  1.03    -0.34
## local_res_atom_non_h_count                  5.200000e+01  1.05    -0.26
## local_res_atom_non_h_occupancy_sum          5.290000e+01  1.14     0.03
## local_res_atom_non_h_electron_sum           3.720000e+02  1.08    -0.23
## local_res_atom_non_h_electron_occupancy_sum 3.792000e+02  1.17     0.06
## local_res_atom_C_count                      3.800000e+01  1.34     0.58
## local_res_atom_N_count                      1.000000e+01  1.41     1.08
## local_res_atom_O_count                      1.700000e+01  1.51     2.56
## local_res_atom_S_count                      4.000000e+00  3.48    18.49
## dict_atom_non_h_count                       5.500000e+01  1.01    -0.31
## dict_atom_non_h_electron_sum                3.840000e+02  1.04    -0.24
## dict_atom_C_count                           5.000000e+01  1.33     0.56
## dict_atom_N_count                           1.000000e+01  1.48     1.27
## dict_atom_O_count                           1.700000e+01  1.47     2.59
## dict_atom_S_count                           4.000000e+00  3.59    19.41
## skeleton_data*                              9.990000e+02  0.00    -1.21
## skeleton_cycle_4                            6.300000e+02 16.52   281.54
## skeleton_diameter                           1.760000e+02  1.76     3.14
## skeleton_cycle_6                            2.700000e+01 16.50   279.54
## skeleton_cycle_7                            1.700000e+01 17.76   328.34
## skeleton_closeness_006_008                  1.600000e+02  7.58    65.71
## skeleton_closeness_002_004                  1.600000e+01 31.11   973.72
## skeleton_cycle_3                            1.910000e+02 15.87   252.26
## skeleton_avg_degree                         5.980000e+00 -0.33     6.04
## skeleton_closeness_004_006                  2.070000e+02 17.65   357.99
## skeleton_closeness_010_012                  6.900000e+01  3.62    13.59
## skeleton_closeness_012_014                  9.500000e+01  3.90    19.92
## skeleton_edges                              1.996000e+03 11.74   168.03
## skeleton_radius                             8.800000e+01  1.74     3.04
## skeleton_cycle_8_plus                       4.240000e+02 16.18   265.96
## skeleton_closeness_020_030                  8.600000e+01  2.29     5.09
## skeleton_deg_5_plus                         6.330000e+02 16.10   262.37
## skeleton_closeness_016_018                  5.600000e+01  3.18    12.83
## skeleton_closeness_008_010                  1.090000e+02  5.10    28.09
## skeleton_closeness_018_020                  6.000000e+01  4.08    27.02
## skeleton_average_clustering                 1.200000e-01 15.60   243.82
## skeleton_closeness_040_050                  5.200000e+02 16.46   279.26
## skeleton_closeness_014_016                  9.200000e+01  4.39    27.66
## skeleton_center                             8.300000e+01 15.24   264.47
## skeleton_closeness_000_002                  1.000000e+00  2.42     3.85
## skeleton_density                            1.000000e+00  1.58     1.17
## skeleton_closeness_030_040                  4.300000e+02 24.01   679.08
## skeleton_deg_4                              3.600000e+01 15.20   239.39
## skeleton_deg_0                              1.000000e+00  2.42     3.85
## skeleton_deg_1                              3.100000e+01  3.36    17.35
## skeleton_deg_2                              3.200000e+02  2.36     6.56
## skeleton_deg_3                              3.500000e+01  3.95    22.14
## skeleton_graph_clique_number                3.000000e+00 -0.89     7.30
## skeleton_nodes                              6.670000e+02  4.03    26.10
## skeleton_cycles                             1.329000e+03 16.12   263.60
## skeleton_cycle_5                            6.700000e+01 15.90   253.79
## skeleton_closeness_050_plus                 2.100000e+01  0.76    -0.89
## skeleton_periphery                          1.100000e+01  6.07    94.55
## local_volume                                9.575940e+03  3.25    13.18
## local_electrons                             1.708600e+02  2.48     7.20
## local_mean                                  2.400000e-01  3.51    21.67
## local_std                                   9.300000e-01  3.71    20.66
## local_min                                   0.000000e+00   NaN      NaN
## local_max                                   2.698000e+01  7.64    81.96
## local_max_over_std                          9.860000e+01  4.22    27.93
## local_skewness                              2.320000e+00  4.91    37.21
## local_cut_by_mainchain_volume               1.013000e+01  3.27    12.88
## local_near_cut_count_C                      4.100000e+01  2.13     5.99
## local_near_cut_count_other                  3.000000e+00  8.93    86.67
## local_near_cut_count_S                      6.000000e+00  5.16    31.22
##                                                       se
## blob_coverage*                              9.130000e+00
## res_coverage*                               9.150000e+00
## title*                                      7.710000e+00
## pdb_code*                                   1.350000e+00
## res_name*                                   1.230000e+00
## res_id                                      2.731000e+01
## chain_id*                                   7.000000e-02
## blob_volume_coverage                        1.000000e-02
## blob_volume_coverage_second                 0.000000e+00
## res_volume_coverage                         1.000000e-02
## res_volume_coverage_second                  1.000000e-02
## local_res_atom_count                        4.800000e-01
## local_res_atom_non_h_count                  4.700000e-01
## local_res_atom_non_h_occupancy_sum          4.600000e-01
## local_res_atom_non_h_electron_sum           3.100000e+00
## local_res_atom_non_h_electron_occupancy_sum 3.060000e+00
## local_res_atom_C_count                      3.500000e-01
## local_res_atom_N_count                      7.000000e-02
## local_res_atom_O_count                      1.100000e-01
## local_res_atom_S_count                      2.000000e-02
## dict_atom_non_h_count                       4.700000e-01
## dict_atom_non_h_electron_sum                3.120000e+00
## dict_atom_C_count                           3.500000e-01
## dict_atom_N_count                           7.000000e-02
## dict_atom_O_count                           1.100000e-01
## dict_atom_S_count                           2.000000e-02
## skeleton_data*                              9.160000e+00
## skeleton_cycle_4                            9.800000e-01
## skeleton_diameter                           9.500000e-01
## skeleton_cycle_6                            4.000000e-02
## skeleton_cycle_7                            3.000000e-02
## skeleton_closeness_006_008                  4.000000e-01
## skeleton_closeness_002_004                  2.000000e-02
## skeleton_cycle_3                            3.400000e-01
## skeleton_avg_degree                         2.000000e-02
## skeleton_closeness_004_006                  2.800000e-01
## skeleton_closeness_010_012                  3.200000e-01
## skeleton_closeness_012_014                  3.100000e-01
## skeleton_edges                              3.640000e+00
## skeleton_radius                             4.800000e-01
## skeleton_cycle_8_plus                       7.000000e-01
## skeleton_closeness_020_030                  3.900000e-01
## skeleton_deg_5_plus                         1.050000e+00
## skeleton_closeness_016_018                  2.000000e-01
## skeleton_closeness_008_010                  4.000000e-01
## skeleton_closeness_018_020                  1.800000e-01
## skeleton_average_clustering                 0.000000e+00
## skeleton_closeness_040_050                  8.900000e-01
## skeleton_closeness_014_016                  2.800000e-01
## skeleton_center                             1.300000e-01
## skeleton_closeness_000_002                  1.000000e-02
## skeleton_density                            1.000000e-02
## skeleton_closeness_030_040                  4.700000e-01
## skeleton_deg_4                              6.000000e-02
## skeleton_deg_0                              1.000000e-02
## skeleton_deg_1                              1.000000e-01
## skeleton_deg_2                              1.430000e+00
## skeleton_deg_3                              1.200000e-01
## skeleton_graph_clique_number                1.000000e-02
## skeleton_nodes                              1.930000e+00
## skeleton_cycles                             2.200000e+00
## skeleton_cycle_5                            1.200000e-01
## skeleton_closeness_050_plus                 1.800000e-01
## skeleton_periphery                          2.000000e-02
## local_volume                                3.838000e+01
## local_electrons                             8.000000e-01
## local_mean                                  0.000000e+00
## local_std                                   0.000000e+00
## local_min                                   0.000000e+00
## local_max                                   6.000000e-02
## local_max_over_std                          2.900000e-01
## local_skewness                              1.000000e-02
## local_cut_by_mainchain_volume               4.000000e-02
## local_near_cut_count_C                      1.900000e-01
## local_near_cut_count_other                  1.000000e-02
## local_near_cut_count_S                      2.000000e-02
##  [ osiągnięto getOption("max.print") -- pominięto 336 wierszy]

2.5.3 Find empty numeric attributes

# Find "empty" numeric attributes: columns of data.withoutGaps whose minimum
# equals their maximum, i.e. that hold a single constant value.
#
# `stats` is a one-row table; the lookups below expect it to contain a
# "<col>_min" and a "<col>_max" column for every attribute in attrib.legal.
# NOTE(review): funs() is deprecated in recent dplyr releases; kept as-is so
# the shape of `stats` does not change - confirm the installed dplyr version
# before migrating to list(min = min, max = max).
stats <- data.withoutGaps %>% select(attrib.legal) %>% summarise_all(funs(min,max))

# Fetch the extremes of all attributes at once. unlist() yields a plain vector
# whether `stats` is a base data.frame or a tibble, and no variable named
# max/min/diff is created, so the base functions stay unshadowed.
.attrib.max <- unlist(stats[1, paste0(attrib.legal, "_max")], use.names = FALSE)
.attrib.min <- unlist(stats[1, paste0(attrib.legal, "_min")], use.names = FALSE)
.attrib.range <- .attrib.max - .attrib.min

# A range of NA (the column contains missing values) is not treated as empty;
# it previously aborted the loop with "missing value where TRUE/FALSE needed".
.attrib.constant <- attrib.legal[!is.na(.attrib.range) & .attrib.range == 0]

# Keep the original result type: NULL when no attribute is constant,
# otherwise a character vector of attribute names in attrib.legal order.
attrib.empty <- c()
if (length(.attrib.constant) > 0) {
  attrib.empty <- .attrib.constant
}

2.6 Processing

2.6.1 Choose top 50 ligands, number of examples per class

# Count the examples of every target class (attrib.target_class), order the
# classes from most to least frequent and keep the first noTopClasses of them.
top50 <- data.withoutGaps %>%
  group_by(!!treat_string_as_col(attrib.target_class)) %>%
  summarize(numberOfExamples = n()) %>%
  arrange(desc(numberOfExamples)) %>%
  head(noTopClasses)
top50
## # A tibble: 50 x 2
##    res_name numberOfExamples
##    <fct>               <int>
##  1 SO4                   116
##  2 HEM                    95
##  3 GOL                    64
##  4 NAG                    54
##  5 ZN                     40
##  6 MLY                    32
##  7 MG                     30
##  8 CD                     25
##  9 K                      24
## 10 CA                     21
## # ... with 40 more rows
# Reduce top50 to the class-name column only.
top50 <- select(top50, c(attrib.target_class))
# Class names as a plain, unnamed vector for the %in% filter below.
.topClasses <- unlist(top50[, attrib.target_class], use.names = FALSE)
# Keep only the rows whose target class is one of the selected classes.
data.top50 <- filter(
  data.withoutGaps,
  !!treat_string_as_col(attrib.target_class) %in% .topClasses
)
# Drop the attributes listed in attrib.empty.
data.withoutEmpty <- select(data.top50, -attrib.empty)

2.6.2 Correlation

# Restrict the data to the legal attributes that were not flagged as empty.
data.legal <- select(data.withoutEmpty, setdiff(attrib.legal, attrib.empty))
# Pairwise correlation matrix (cor() uses its default method).
correlation <- cor(data.legal)
# Long format: one row per attribute pair (Var1, Var2) with its value.
melted <- melt(correlation)
# Use the absolute value of each correlation and list the largest first.
df.cor.mel <- data.frame(melted) %>%
  mutate(value = abs(value)) %>%
  arrange(desc(value))

# Heatmap of the absolute correlations: one tile per attribute pair, shaded
# from white (low) to brown (high).
p <- ggplot(df.cor.mel, aes(x = Var1, y = Var2, fill = value))
p <- p + geom_tile()
p <- p + scale_fill_gradient(low = "white", high = "brown")
p <- p + xlab("parameters") + ylab("parameters")
# Rotate the x-axis labels.
p <- p + theme(axis.text.x = element_text(angle = -90, hjust = 1))

# Render the ggplot object as an interactive 700 x 700 plotly widget.
ggplotly(p, height = 700, width = 700)

2.6.3 Distribution of atoms and electrons

2.6.4 Analyze part_01 parameters

### The 10 most incompatible classes, where incompatibility is measured as sum(abs(local,dict))